# Load of libraries
library(readr) # Read cvs
library(dplyr) # Handle data and use of pipes
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) # Handle dataframes
library(ggplot2) # Useful library to visualizations
library(lubridate) # Handle with dates
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(randomForest) # Random Forest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(Metrics) # Use of RMSE metric
library(ggcorrplot) # Correlation plot
library(caret) # ML library
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(scales) # To rescale output values
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(xts)
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tictoc)

rm(list = ls(all.names = TRUE))

setwd("/Users/pablosalarcarrera/Desktop/Master Classes/DS636/")

# Load datasets
sales_train <- read.csv("sales_train.csv", header=T)
items <- read.csv("items.csv", header=T)
items_categories <- read.csv("item_categories.csv", header=T)
shops <- read.csv("shops.csv", header=T)

# Print first 5 rows of each dataset
head(sales_train)
##         date date_block_num shop_id item_id item_price item_cnt_day
## 1 02.01.2013              0      59   22154     999.00            1
## 2 03.01.2013              0      25    2552     899.00            1
## 3 05.01.2013              0      25    2552     899.00           -1
## 4 06.01.2013              0      25    2554    1709.05            1
## 5 15.01.2013              0      25    2555    1099.00            1
## 6 10.01.2013              0      25    2564     349.00            1
head(items)
##                                                              item_name item_id
## 1                            ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D       0
## 2 !ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]       1
## 3                        ***В ЛУЧАХ СЛАВЫ   (UNV)                    D       2
## 4                      ***ГОЛУБАЯ ВОЛНА  (Univ)                      D       3
## 5                          ***КОРОБКА (СТЕКЛО)                       D       4
## 6                  ***НОВЫЕ АМЕРИКАНСКИЕ ГРАФФИТИ  (UNI)             D       5
##   item_category_id
## 1               40
## 2               76
## 3               40
## 4               40
## 5               40
## 6               40
head(items_categories)
##        item_category_name item_category_id
## 1 PC - Гарнитуры/Наушники                0
## 2        Аксессуары - PS2                1
## 3        Аксессуары - PS3                2
## 4        Аксессуары - PS4                3
## 5        Аксессуары - PSP                4
## 6     Аксессуары - PSVita                5
head(shops)
##                        shop_name shop_id
## 1  !Якутск Орджоникидзе, 56 фран       0
## 2  !Якутск ТЦ "Центральный" фран       1
## 3               Адыгея ТЦ "Мега"       2
## 4 Балашиха ТРК "Октябрь-Киномир"       3
## 5       Волжский ТЦ "Волга Молл"       4
## 6         Вологда ТРЦ "Мармелад"       5
# Merge datasets into a unique dataset 
sales <- sales_train %>% left_join(items,by = "item_id") %>% left_join(items_categories,by = "item_category_id") %>% left_join(shops,by = "shop_id")
# Fix the format of date column
sales$date <- as.Date(sales$date,"%d.%m.%Y")

# We may want to know sales in terms of day, month, weekday and year, so create four new columns.
sales$day = day(sales$date)
sales$month = month(sales$date)
sales$weekday = wday(sales$date)
sales$year = year(sales$date)
# Transform IDs into categorical variables
sales$item_id = factor(sales$item_id)
sales$item_category_id = factor(sales$item_category_id)
sales$shop_id = factor(sales$shop_id)

# Print first rows of the new dataset
head(sales)
##         date date_block_num shop_id item_id item_price item_cnt_day
## 1 2013-01-02              0      59   22154     999.00            1
## 2 2013-01-03              0      25    2552     899.00            1
## 3 2013-01-05              0      25    2552     899.00           -1
## 4 2013-01-06              0      25    2554    1709.05            1
## 5 2013-01-15              0      25    2555    1099.00            1
## 6 2013-01-10              0      25    2564     349.00            1
##                                            item_name item_category_id
## 1                                  ЯВЛЕНИЕ 2012 (BD)               37
## 2           DEEP PURPLE  The House Of Blue Light  LP               58
## 3           DEEP PURPLE  The House Of Blue Light  LP               58
## 4           DEEP PURPLE  Who Do You Think We Are  LP               58
## 5            DEEP PURPLE 30 Very Best Of 2CD (Фирм.)               56
## 6 DEEP PURPLE Perihelion: Live In Concert DVD (Кир.)               59
##                    item_category_name              shop_name day month weekday
## 1                      Кино - Blu-Ray Ярославль ТЦ "Альтаир"   2     1       4
## 2                      Музыка - Винил    Москва ТРК "Атриум"   3     1       5
## 3                      Музыка - Винил    Москва ТРК "Атриум"   5     1       7
## 4                      Музыка - Винил    Москва ТРК "Атриум"   6     1       1
## 5 Музыка - CD фирменного производства    Москва ТРК "Атриум"  15     1       3
## 6          Музыка - Музыкальное видео    Москва ТРК "Атриум"  10     1       5
##   year
## 1 2013
## 2 2013
## 3 2013
## 4 2013
## 5 2013
## 6 2013
# Check missing values
sum(is.na(sales))
## [1] 0
sum(duplicated(sales))
## [1] 6
which(duplicated(sales))
## [1]   76963 1435368 1496767 1671874 1866341 2198567
# Eliminate duplicated values
sales <- sales[!duplicated(sales), ]

# Check lowest value in item price
min(sales$item_price)
## [1] -1
# Price cannot be negative 
sales <- sales[sales$item_price>=0, ]

boxplot(sales$`item_price`,
        ylab = "Item price"
)

# Deal with outliers
sales <- sales[sales$item_price < 100000, ]
boxplot(sales$`item_price`,
        ylab = "Item price"
)

sales <- sales[sales$item_price < 40000, ]
boxplot(sales$`item_price`,
        ylab = "Item price"
)

glimpse(sales)
## Rows: 2,935,828
## Columns: 14
## $ date               <date> 2013-01-02, 2013-01-03, 2013-01-05, 2013-01-06, 20…
## $ date_block_num     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shop_id            <fct> 59, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,…
## $ item_id            <fct> 22154, 2552, 2552, 2554, 2555, 2564, 2565, 2572, 25…
## $ item_price         <dbl> 999.00, 899.00, 899.00, 1709.05, 1099.00, 349.00, 5…
## $ item_cnt_day       <dbl> 1, 1, -1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 1, 2, 1,…
## $ item_name          <chr> "ЯВЛЕНИЕ 2012 (BD)", "DEEP PURPLE  The House Of Blu…
## $ item_category_id   <fct> 37, 58, 58, 58, 56, 59, 56, 55, 55, 55, 55, 55, 55,…
## $ item_category_name <chr> "Кино - Blu-Ray", "Музыка - Винил", "Музыка - Винил…
## $ shop_name          <chr> "Ярославль ТЦ \"Альтаир\"", "Москва ТРК \"Атриум\""…
## $ day                <int> 2, 3, 5, 6, 15, 10, 2, 4, 11, 3, 3, 5, 7, 8, 10, 11…
## $ month              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ weekday            <dbl> 4, 5, 7, 1, 3, 5, 4, 6, 6, 5, 5, 7, 2, 3, 5, 6, 1, …
## $ year               <dbl> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201…
# Remove the columns with the names
sales <- sales %>% select(-item_name, -item_category_name, -shop_name)
glimpse(sales)
## Rows: 2,935,828
## Columns: 11
## $ date             <date> 2013-01-02, 2013-01-03, 2013-01-05, 2013-01-06, 2013…
## $ date_block_num   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ shop_id          <fct> 59, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2…
## $ item_id          <fct> 22154, 2552, 2552, 2554, 2555, 2564, 2565, 2572, 2572…
## $ item_price       <dbl> 999.00, 899.00, 899.00, 1709.05, 1099.00, 349.00, 549…
## $ item_cnt_day     <dbl> 1, 1, -1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 1, 2, 1, 1…
## $ item_category_id <fct> 37, 58, 58, 58, 56, 59, 56, 55, 55, 55, 55, 55, 55, 5…
## $ day              <int> 2, 3, 5, 6, 15, 10, 2, 4, 11, 3, 3, 5, 7, 8, 10, 11, …
## $ month            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ weekday          <dbl> 4, 5, 7, 1, 3, 5, 4, 6, 6, 5, 5, 7, 2, 3, 5, 6, 1, 4,…
## $ year             <dbl> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
describe(sales)
## sales 
## 
##  11  Variables      2935828  Observations
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##    2935828          0       1034          1 2014-04-03      330.2 2013-02-09 
##        .10        .25        .50        .75        .90        .95 
## 2013-03-17 2013-08-01 2014-03-04 2014-12-05 2015-05-20 2015-08-09 
## 
## lowest : 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05
## highest: 2015-10-27 2015-10-28 2015-10-29 2015-10-30 2015-10-31
## --------------------------------------------------------------------------------
## date_block_num 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##  2935828        0       34    0.999    14.57    10.84        1        2 
##      .25      .50      .75      .90      .95 
##        7       14       23       28       31 
## 
## lowest :  0  1  2  3  4, highest: 29 30 31 32 33
## --------------------------------------------------------------------------------
## shop_id 
##        n  missing distinct 
##  2935828        0       60 
## 
## lowest : 0  1  2  3  4 , highest: 55 56 57 58 59
## --------------------------------------------------------------------------------
## item_id 
##        n  missing distinct 
##  2935828        0    21802 
## 
## lowest : 0     1     2     3     4    , highest: 22165 22166 22167 22168 22169
## --------------------------------------------------------------------------------
## item_price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##  2935828        0    19983    0.998    890.6     1025       99      149 
##      .25      .50      .75      .90      .95 
##      249      399      999     1999     2690 
## 
## lowest :     0.0700     0.0875     0.0900     0.1000     0.2000
## highest: 35490.0000 35990.0000 35991.0000 36990.0000 37991.0000
## --------------------------------------------------------------------------------
## item_cnt_day 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##  2935828        0      198    0.281    1.243   0.4806        1        1 
##      .25      .50      .75      .90      .95 
##        1        1        1        2        2 
## 
## lowest :  -22  -16   -9   -6   -5, highest:  624  637  669 1000 2169
## --------------------------------------------------------------------------------
## item_category_id 
##        n  missing distinct 
##  2935828        0       84 
## 
## lowest : 0  1  2  3  4 , highest: 79 80 81 82 83
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##  2935828        0       31    0.999    15.85     10.3        2        3 
##      .25      .50      .75      .90      .95 
##        8       16       24       28       30 
## 
## lowest :  1  2  3  4  5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##  2935828        0       12    0.993    6.248    4.065        1        1 
##      .25      .50      .75      .90      .95 
##        3        6        9       11       12 
## 
## lowest :  1  2  3  4  5, highest:  8  9 10 11 12
##                                                                          
## Value           1      2      3      4      5      6      7      8      9
## Frequency  303559 270250 284055 228289 224834 237428 234856 248415 219881
## Proportion  0.103  0.092  0.097  0.078  0.077  0.081  0.080  0.085  0.075
##                                
## Value          10     11     12
## Frequency  227068 183163 274030
## Proportion  0.077  0.062  0.093
## --------------------------------------------------------------------------------
## weekday 
##        n  missing distinct     Info     Mean      Gmd 
##  2935828        0        7    0.977    4.166     2.45 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                            
## Value           1      2      3      4      5      6      7
## Frequency  503102 337074 345767 352960 367272 439296 590357
## Proportion  0.171  0.115  0.118  0.120  0.125  0.150  0.201
## --------------------------------------------------------------------------------
## year 
##        n  missing distinct     Info     Mean      Gmd 
##  2935828        0        3    0.864     2014   0.8209 
##                                   
## Value         2013    2014    2015
## Frequency  1267557 1055854  612417
## Proportion   0.432   0.360   0.209
## --------------------------------------------------------------------------------
# Check how many different items
sales %>% select(item_id) %>% distinct() %>% count()
##       n
## 1 21802
# We can extract the numbers of shops, products and categories
print(paste0("numbers of shops: ",length(unique(sales$shop_id))))
## [1] "numbers of shops: 60"
print(paste0("numbers of products: ",length(unique(sales$item_id))))
## [1] "numbers of products: 21802"
print(paste0("numbers of categories: ",length(unique(sales$item_category_id))))
## [1] "numbers of categories: 84"
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reactable)
library(htmlwidgets)
library('IRdisplay')
library(htmltools)

product_sold_Year<-sales%>%group_by(year)%>%summarise(Product_sold_year=sum(item_cnt_day))
product_sold_Year
## # A tibble: 3 × 2
##    year Product_sold_year
##   <dbl>             <dbl>
## 1  2013           1562728
## 2  2014           1320882
## 3  2015            764575
v1 <- plot_ly(product_sold_Year, x = ~year, y = ~Product_sold_year, color = ~year, colors = c("#4C3A51", "#774360", "#B25068"), type = 'bar') %>%
  layout(title = 'Total Product Sold Per Year', yaxis = list(title = 'Total Item Sold'))

v1
## Warning: textfont.color doesn't (yet) support data arrays

## Warning: textfont.color doesn't (yet) support data arrays
sales_price_Year<-sales%>%group_by(year)%>%summarise(Sales_value_year=sum(item_price*item_cnt_day))
sales_price_Year
## # A tibble: 3 × 2
##    year Sales_value_year
##   <dbl>            <dbl>
## 1  2013      1217115406.
## 2  2014      1346682085.
## 3  2015       834234429.
v4<-plot_ly(sales_price_Year, x=~year, y=~Sales_value_year, color = ~year, colors = c("#764AF1","#9772FB","#F2F2F2"), type='bar')%>% layout(title='Total Sales Value Per Year', yaxis=list(title='Total Sales Value'))

v4
## Warning: textfont.color doesn't (yet) support data arrays

## Warning: textfont.color doesn't (yet) support data arrays
data_aggr1 <- aggregate(cbind(item_price, item_cnt_day) ~ month + year, sales, FUN = sum)
head(data_aggr1)
##   month year item_price item_cnt_day
## 1     1 2013   82211725       131478
## 2     2 2013   75580187       128090
## 3     3 2013   84298312       147142
## 4     4 2013   61512823       107190
## 5     5 2013   57274133       106969
## 6     6 2013   63343615       125381
# Convert the revenue per month to millions
data_aggr1$item_price <- round(data_aggr1$item_price/1000000)
# Convert the items sold per month to thousands
data_aggr1$item_cnt_day <- round(data_aggr1$item_cnt_day/1000)

colnames(data_aggr1) <- c('month','year','Revenue (in millions $)', 'Number of items sold (thousands)')
head(data_aggr1)
##   month year Revenue (in millions $) Number of items sold (thousands)
## 1     1 2013                      82                              131
## 2     2 2013                      76                              128
## 3     3 2013                      84                              147
## 4     4 2013                      62                              107
## 5     5 2013                      57                              107
## 6     6 2013                      63                              125
tail(data_aggr1)
##    month year Revenue (in millions $) Number of items sold (thousands)
## 29     5 2015                      59                               72
## 30     6 2015                      56                               64
## 31     7 2015                      54                               63
## 32     8 2015                      54                               66
## 33     9 2015                      59                               73
## 34    10 2015                      65                               71
forecasting_revenue <- data_aggr1 %>% select(`Revenue (in millions $)`)

df1 <- ts(forecasting_revenue,frequency=12,start=c(2013,1),end=c(2015,10))

head(df1)
##      Revenue (in millions $)
## [1,]                      82
## [2,]                      76
## [3,]                      84
## [4,]                      62
## [5,]                      57
## [6,]                      63
library(ggfortify)
library(forecast)
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:Metrics':
## 
##     accuracy
revenue_yearly_plot <- autoplot(df1)+geom_smooth(method='loess') + ggtitle('Revenue of Sales of Products Yearly')+xlab('Year') +ylab('$ Millions')

revenue_seasonal_plot <- ggseasonplot(df1, year.labels=TRUE, year.labels.left=TRUE) +
  ylab("$ million") +
  ggtitle("Seasonal plot: Product Sales Revenue")

revenue_seasonal_plot2 <- ggseasonplot(df1, polar=TRUE) +
  ylab("$ million") +
  ggtitle("Seasonal plot: Sale of Products Revenue")

forecasting_items_sold <- data_aggr1 %>% select(`Number of items sold (thousands)`)

df2 <- ts(forecasting_items_sold,frequency=12,start=c(2013,1),end=c(2015,10))

head(df2)
##      Jan Feb Mar Apr May Jun
## 2013 131 128 147 107 107 125
items_soldyearly_plot <- autoplot(df2)+geom_smooth(method='loess') + ggtitle('Sales of Products Yearly')+xlab('Year') +ylab('Items sold (Thousand)')
items_soldseasonal_plot <- ggseasonplot(df2, year.labels=TRUE, year.labels.left=TRUE) +
  ylab("Items sold (Thousand)") +
  ggtitle("Seasonal plot: Product Sales")

items_soldseasonal_plot2 <- ggseasonplot(df2, polar=TRUE) +
  ylab("Items sold (Thousand)") +
  ggtitle("Seasonal plot: Sale of Products")

options(repr.plot.width = 14, repr.plot.height = 8)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(revenue_yearly_plot, items_soldyearly_plot, nrow=2, ncol=1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

grid.arrange(revenue_seasonal_plot, items_soldseasonal_plot, nrow=2, ncol=1)

grid.arrange(revenue_seasonal_plot2, items_soldseasonal_plot2, nrow=2, ncol=1)

# Check how many years we have in the data
sales %>% select(year) %>% distinct() %>% count()
##   n
## 1 3
year_sales = sales%>%group_by(year)%>%
  summarise(sales_per_year = sum(item_price*item_cnt_day)) %>%
  arrange(year)%>%
  ungroup()
year_sales_plot <- ggplot(data=year_sales, aes(x = year, y = sales_per_year, fill = as.factor(year))) + 
  geom_bar(stat = "identity") + 
  labs(title= "Revenue Sales per Year", x= "Year", y = "Total Sales per Year", fill = "Year")


year_month_sales = sales%>%group_by(year, month)%>%
  summarise(total_sales = sum(item_price*item_cnt_day)) %>%
  arrange(year)%>%
  ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
year_month_sales_plot <- ggplot(data=year_month_sales, aes(x = year, y = total_sales, fill = as.factor(month))) + 
  geom_bar(stat = "identity") + 
  labs(title= "Year-Month Revenue Sales", x= "Year", y = "Year-Month Revenue Sales", fill = "Month")
# We want to forecast sales for month 11 and 12


highest_sales_day = sales%>%group_by(date)%>%
  summarise(sales_per_day = sum(item_price*item_cnt_day)) %>%
  arrange(desc(sales_per_day))%>%
  ungroup()
highest_sales_day_plot <- ggplot(data=highest_sales_day, aes(x = date, y = sales_per_day)) + 
  geom_point(na.rm = TRUE, color = "blue", size = 0.3) + 
  (scale_x_date(breaks = date_breaks("7 months"), labels = date_format("%b %y"))) +
  labs(title= "Count of Items Sold per Day", x= "Date(s)", y = "Total sales per day")


grid.arrange(year_sales_plot, year_month_sales_plot, nrow=2, ncol=1)

highest_sales_day_plot

# Decomposing the time series
df_revenue <- decompose(df1)
plot(df_revenue)

df_n_sales <- decompose(df2)
plot(df_n_sales)

# Testing whether the time series is stationary or not
# Augmented Dickey-Fuller Test (adf test). A p-Value of less than 0.05 in adf.test() indicates that it is stationary.
adf.test(df1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df1
## Dickey-Fuller = -2.5878, Lag order = 3, p-value = 0.3452
## alternative hypothesis: stationary
adf.test(df2)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df2
## Dickey-Fuller = -2.7272, Lag order = 3, p-value = 0.2912
## alternative hypothesis: stationary
# Load test data
sales_test <- read.csv("test.csv", header=T)

head(sales_test)
##   ID shop_id item_id
## 1  0       5    5037
## 2  1       5    5320
## 3  2       5    5233
## 4  3       5    5232
## 5  4       5    5268
## 6  5       5    5039
sales_test %>% select(item_id) %>% distinct() %>% count()
##      n
## 1 5100
# Function to retrieve item_category_id based on item_id
get_item_category_id <- function(item_id, items) {
  category_id <- items$item_category_id[items$item_id == item_id]
  if (length(category_id) == 0) {
    return(NA) # Return NA if item_name not found
  } else {
    return(category_id)
  }
}

# Apply the function to the 'sales_test' dataframe
sales_test$item_category_id <- sapply(sales_test$item_id, get_item_category_id, items = items)

head(sales_test)
##   ID shop_id item_id item_category_id
## 1  0       5    5037               19
## 2  1       5    5320               55
## 3  2       5    5233               19
## 4  3       5    5232               23
## 5  4       5    5268               20
## 6  5       5    5039               23
sales_test$shop_id <- factor(sales_test$shop_id)
sales_test$item_category_id <- factor(sales_test$item_category_id)
head(sales_test)
##   ID shop_id item_id item_category_id
## 1  0       5    5037               19
## 2  1       5    5320               55
## 3  2       5    5233               19
## 4  3       5    5232               23
## 5  4       5    5268               20
## 6  5       5    5039               23
## Linear Regression Model for Number of Sales####
lr_items_model <- lm(item_cnt_day ~ shop_id + item_category_id, data = sales)

# Make predictions on sales_test data
sales_test_predictions <- predict(lr_items_model, newdata = sales_test)

total_sales <- sum(sales_test_predictions)
total_sales <- total_sales/1000
total_sales
## [1] 146.4998
## Linear Regression Model for Revenue ####
lr_revenue_model <- lm(item_price ~ shop_id + item_category_id, data = sales)

# Make predictions on sales_test data
sales_test_predictions_revenue <- predict(lr_revenue_model, newdata = sales_test)
total_revenue <- sum(sales_test_predictions_revenue)
total_revenue <- total_revenue/1000000
total_revenue
## [1] 197.2241
library(lightgbm)
## 
## Attaching package: 'lightgbm'
## The following object is masked from 'package:plotly':
## 
##     slice
## The following object is masked from 'package:dplyr':
## 
##     slice
# Lightgbm model for revenue prediction
# Define the training data
train_data_revenue <- lgb.Dataset(data = as.matrix(sales[, c("shop_id", "item_category_id")]),
                          label = sales$item_price)

# Set LightGBM parameters
params <- list(objective = "regression",
               metric = "rmse",
               num_leaves = 31,
               learning_rate = 0.05,
               feature_fraction = 0.9,
               bagging_fraction = 0.8,
               bagging_freq = 5,
               verbose = -1)

# Train the LightGBM model
lgb_model_revenue <- lgb.train(params = params,
                       data = train_data_revenue,
                       nrounds = 100)

# Make predictions on the sales_test dataset
revenue_test_predictions_lgb <- predict(lgb_model_revenue, as.matrix(sales_test[, c("shop_id", "item_category_id")]))
total_revenue_lgb <- sum(revenue_test_predictions_lgb)
total_revenue_lgb <- total_revenue_lgb/1000000
total_revenue_lgb
## [1] 191.8044
# Lightgbm model for number of sales prediction
# Define the training data
train_data_sales <- lgb.Dataset(data = as.matrix(sales[, c("shop_id", "item_category_id")]),
                          label = sales$item_cnt_day)

# Set LightGBM parameters
params <- list(objective = "regression",
               metric = "rmse",
               num_leaves = 31,
               learning_rate = 0.05,
               feature_fraction = 0.9,
               bagging_fraction = 0.8,
               bagging_freq = 5,
               verbose = -1)

# Train the LightGBM model
lgb_model_sales <- lgb.train(params = params,
                       data = train_data_sales,
                       nrounds = 100)

# Make predictions on the sales_test dataset
sales_test_predictions_lgb <- predict(lgb_model_sales, as.matrix(sales_test[, c("shop_id", "item_category_id")]))
total_sales_lgb <- sum(sales_test_predictions_lgb)
total_sales_lgb <- total_sales_lgb/1000
total_sales_lgb
## [1] 244.4263
# 1 year Time Series Predictions for Revenue 
df1_train <- head(df1, length(df1) - 9)
df1_test <- tail(df1, 9)

df1_test
##      Feb Mar Apr May Jun Jul Aug Sep Oct
## 2015  73  71  59  59  56  54  54  59  65
df1_train_ts <- ts(df1_train, start=c(2013, 1), end=c(2015, 10), frequency=12)

df1_test_ts <- ts(df1_test,start=c(2013, 1), end=c(2016, 10), frequency=12)

df1_naive_mod <- naive(df1_train_ts, h = 12)
summary(df1_naive_mod)
## 
## Forecast method: Naive method
## 
## Model Information:
## Call: naive(y = df1_train_ts, h = 12) 
## 
## Residual sd: 25.6119 
## 
## Error measures:
##                      ME     RMSE      MAE       MPE     MAPE     MASE
## Training set -0.2727273 25.61191 15.06061 -2.988435 16.35679 1.428161
##                    ACF1
## Training set -0.2676921
## 
## Forecasts:
##          Point Forecast       Lo 80    Hi 80       Lo 95    Hi 95
## Nov 2015             73  40.1770226 105.8230   22.801588 123.1984
## Dec 2015             73  26.5813002 119.4187    2.008725 143.9913
## Jan 2016             73  16.1489354 129.8511  -13.946200 159.9462
## Feb 2016             73   7.3540451 138.6460  -27.396824 173.3968
## Mar 2016             73  -0.3944088 146.3944  -39.247062 185.2471
## Apr 2016             73  -7.3995465 153.3995  -49.960496 195.9605
## May 2016             73 -13.8414356 159.8414  -59.812515 205.8125
## Jun 2016             73 -19.8373997 165.8374  -68.982550 214.9826
## Jul 2016             73 -25.4689323 171.4689  -77.595236 223.5952
## Aug 2016             73 -30.7953683 176.7954  -85.741317 231.7413
## Sep 2016             73 -35.8615006 181.8615  -93.489298 239.4893
## Oct 2016             73 -40.7021291 186.7021 -100.892400 246.8924
df1_test$naive <- 73
## Warning in df1_test$naive <- 73: Coercing LHS to a list
df1_test$naive<- as.integer(df1_test$naive)
df1_res_naive <- mape(df1_test$naive, df1_test_ts) * 100
df1_res_naive
## [1] 15.9321
plot(df1,  main="Forecast for Yearly/Monthly Revenue", xlab="Time", ylab="Millions of Dollars")

lines(df1_naive_mod$mean, col=4) #Naive method prediction

df1_se_model <- ses(df1_train_ts, h = 12)
summary(df1_se_model)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
##  ses(y = df1_train_ts, h = 12) 
## 
##   Smoothing parameters:
##     alpha = 0.5527 
## 
##   Initial states:
##     l = 78.8837 
## 
##   sigma:  24.4567
## 
##      AIC     AICc      BIC 
## 341.2244 342.0244 345.8035 
## 
## Error measures:
##                      ME     RMSE      MAE       MPE     MAPE     MASE
## Training set -0.5289182 23.72644 14.85497 -4.367053 16.45306 1.408661
##                    ACF1
## Training set 0.07488592
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80      Lo 95    Hi 95
## Nov 2015       68.94399 37.601535 100.2865  21.009839 116.8781
## Dec 2015       68.94399 33.132585 104.7554  14.175170 123.7128
## Jan 2016       68.94399 29.162537 108.7255   8.103506 129.7845
## Feb 2016       68.94399 25.554230 112.3338   2.585077 135.3029
## Mar 2016       68.94399 22.223775 115.6642  -2.508415 140.3964
## Apr 2016       68.94399 19.115427 118.7726  -7.262223 145.1502
## May 2016       68.94399 16.189910 121.6981 -11.736415 149.6244
## Jun 2016       68.94399 13.418318 124.4697 -15.975199 153.8632
## Jul 2016       68.94399 10.778644 127.1093 -20.012232 157.9002
## Aug 2016       68.94399  8.253672 129.6343 -23.873844 161.7618
## Sep 2016       68.94399  5.829634 132.0584 -27.581090 165.4691
## Oct 2016       68.94399  3.495314 134.3927 -31.151125 169.0391
df1_test$se <- 68.94

df1_test$se <- as.integer(df1_test$se)
df1_res_se <- mape(df1_test$se, df1_test_ts)*100

df1_res_se
## [1] 12.62788
autoplot(df1_se_model) +
  autolayer(fitted(df1_se_model), series="Fitted") +
  ylab("Revenue (millions of Dollars)") + xlab("Year")

df1_holt_model <- holt(df1,h=12)
df1_holt_model
##          Point Forecast      Lo 80     Hi 80      Lo 95    Hi 95
## Nov 2015       61.35600  28.501365  94.21064  11.109170 111.6028
## Dec 2015       60.84255  22.010077  99.67501   1.453410 120.2317
## Jan 2016       60.32909  16.321929 104.33625  -6.974054 127.6322
## Feb 2016       59.81563  11.179887 108.45137 -14.566320 134.1976
## Mar 2016       59.30217   6.440304 112.16404 -21.543078 140.1474
## Apr 2016       58.78871   2.013237 115.56419 -28.041884 145.6193
## May 2016       58.27526  -2.162055 118.71257 -34.155634 150.7061
## Jun 2016       57.76180  -6.128878 121.65247 -39.950558 155.4742
## Jul 2016       57.24834  -9.919394 124.41607 -45.475845 159.9725
## Aug 2016       56.73488 -13.558267 127.02803 -50.769212 164.2390
## Sep 2016       56.22142 -17.064900 129.50775 -55.860337 168.3032
## Oct 2016       55.70797 -20.454886 131.87082 -60.773064 172.1890
df1_holt_model$mean
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2015                                                                        
## 2016 60.32909 59.81563 59.30217 58.78871 58.27526 57.76180 57.24834 56.73488
##           Sep      Oct      Nov      Dec
## 2015                   61.35600 60.84255
## 2016 56.22142 55.70797
df1_holt <- as.data.frame(df1_holt_model)

df1_test$holt <- df1_holt$`Point Forecast`
df1_res_holt <- mape(df1_test$holt, df1_test_ts)*100
## Warning in `-.default`(actual, predicted): longer object length is not a
## multiple of shorter object length
## Warning in `/.default`(ae(actual, predicted), abs(actual)): longer object length
## is not a multiple of shorter object length
df1_res_holt
## [1] 9.66441
df1_holt_model_damp <- holt(df1,damped = TRUE ,phi = 0.9,h=15)

df1_holt_model_damp
##          Point Forecast      Lo 80     Hi 80      Lo 95    Hi 95
## Nov 2015       62.03692  28.601883  95.47196  10.902442 113.1714
## Dec 2015       62.02293  22.724678 101.32118   1.921441 122.1244
## Jan 2016       62.01033  17.615345 106.40532  -5.885942 129.9066
## Feb 2016       61.99900  13.033927 110.96407 -12.886616 136.8846
## Mar 2016       61.98880   8.844352 115.13324 -19.288618 143.2662
## Apr 2016       61.97962   4.960606 118.99863 -25.223435 149.1827
## May 2016       61.97135   1.324206 122.61850 -30.780454 154.7232
## Jun 2016       61.96392  -2.106759 126.03459 -36.023725 159.9516
## Jul 2016       61.95722  -5.363545 129.27799 -41.001007 164.9155
## Aug 2016       61.95120  -8.470203 132.37260 -45.749041 169.6514
## Sep 2016       61.94578 -11.445707 135.33726 -50.296812 174.1884
## Oct 2016       61.94090 -14.305345 138.18714 -54.667667 178.5495
## Nov 2016       61.93651 -17.061646 140.93466 -58.880742 182.7538
## Dec 2016       61.93255 -19.725035 143.59014 -62.951951 186.8171
## Jan 2017       61.92900 -22.304293 146.16229 -66.894703 190.7527
autoplot(df1) +
  autolayer(df1_holt_model, series="Holt's method", PI=FALSE) +
  autolayer(df1_holt_model_damp, series="Damped Holt's method", PI=FALSE) +
  ggtitle("Revenue forecasts from Holt's method") + xlab("Year") +
  ylab("Revenue in Millions $)") +
  guides(colour=guide_legend(title="Forecast"))

library(DT)

Model_df1 <- c('Naive','Simple Exponential Smoothing','Holt Trend Method')

MAPE_df1 <- c(df1_res_naive,df1_res_se,df1_res_holt)

df1_sales <- data.frame(Model_df1, MAPE_df1)

datatable(df1_sales)
# 1 year Time Series Predictions of number of items sold
df2_train <- head(df2, length(df2) - 12)
df2_test <- tail(df2, 12)

df2_test
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2014                                         118 169
## 2015 111  84  82  78  72  64  63  66  73  71
df2_train_ts <- ts(df2_train, start=c(2013, 1), end=c(2015, 10), frequency=12)

df2_test_ts <- ts(df2_test,start=c(2013, 1), end=c(2016, 10), frequency=12)

df2_naive_mod <- naive(df2_train_ts, h = 12)
summary(df2_naive_mod)
## 
## Forecast method: Naive method
## 
## Model Information:
## Call: naive(y = df2_train_ts, h = 12) 
## 
## Residual sd: 22.1838 
## 
## Error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE      ACF1
## Training set 1.575758 22.18381 14.54545 -0.3224273 11.43132 0.5714286 -0.310782
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Nov 2015            183 154.57031 211.4297 139.52054 226.4795
## Dec 2015            183 142.79435 223.2057 121.51076 244.4892
## Jan 2016            183 133.75833 232.2417 107.69137 258.3086
## Feb 2016            183 126.14062 239.8594  96.04108 269.9589
## Mar 2016            183 119.42928 246.5707  85.77697 280.2230
## Apr 2016            183 113.36177 252.6382  76.49751 289.5025
## May 2016            183 107.78211 258.2179  67.96416 298.0358
## Jun 2016            183 102.58869 263.4113  60.02152 305.9785
## Jul 2016            183  97.71093 268.2891  52.56162 313.4384
## Aug 2016            183  93.09743 272.9026  45.50588 320.4941
## Sep 2016            183  88.70938 277.2906  38.79495 327.2051
## Oct 2016            183  84.51666 281.4833  32.38274 333.6173
df2_test$naive <- 133
## Warning in df2_test$naive <- 133: Coercing LHS to a list
df2_test$naive<- as.integer(df2_test$naive)
df2_res_naive <- mape(df2_test$naive, df2_test_ts) * 100
df2_res_naive
## [1] 38.34586
plot(df2,  main="Forecast for Yearly/Monthly Items Sold", xlab="Time", ylab="Thousands of Sales")

lines(df2_naive_mod$mean, col=4) #Naive method prediction

df2_se_model <- ses(df2_train_ts, h = 12)
summary(df2_se_model)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
##  ses(y = df2_train_ts, h = 12) 
## 
##   Smoothing parameters:
##     alpha = 0.4775 
## 
##   Initial states:
##     l = 129.5837 
## 
##   sigma:  20.59
## 
##      AIC     AICc      BIC 
## 329.5219 330.3219 334.1009 
## 
## Error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 1.541877 19.97525 14.39572 -0.61119 11.41915 0.5655461 0.01602038
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## Nov 2015       154.6137 128.2265 181.0009 114.25800 194.9694
## Dec 2015       154.6137 125.3732 183.8542 109.89415 199.3332
## Jan 2016       154.6137 122.7745 186.4529 105.91982 203.3076
## Feb 2016       154.6137 120.3725 188.8549 102.24626 206.9811
## Mar 2016       154.6137 118.1282 191.0991  98.81402 210.4134
## Apr 2016       154.6137 116.0143 193.2131  95.58100 213.6464
## May 2016       154.6137 114.0102 195.2171  92.51607 216.7113
## Jun 2016       154.6137 112.1006 197.1268  89.59546 219.6319
## Jul 2016       154.6137 110.2730 198.9543  86.80053 222.4269
## Aug 2016       154.6137 108.5179 200.7095  84.11631 225.1111
## Sep 2016       154.6137 106.8272 202.4001  81.53061 227.6968
## Oct 2016       154.6137 105.1943 204.0330  79.03333 230.1941
df2_test$se <- 120.97

df2_test$se <- as.integer(df2_test$se)
df2_res_se <- mape(df2_test$se, df2_test_ts)*100

df2_res_se
## [1] 33.55072
autoplot(df2_se_model) +
  autolayer(fitted(df2_se_model), series="Fitted") +
  ylab("Thousands of Sales") + xlab("Year")

df2_holt_model <- holt(df2,h=12)
df2_holt_model
##          Point Forecast      Lo 80     Hi 80      Lo 95    Hi 95
## Nov 2015       66.67013  38.143380  95.19687  23.042234 110.2980
## Dec 2015       64.75585  32.717167  96.79453  15.756913 113.7548
## Jan 2016       62.84157  27.638414  98.04473   9.002988 116.6802
## Feb 2016       60.92730  22.820462  99.03413   2.647923 119.2067
## Mar 2016       59.01302  18.207597  99.81844  -3.393489 121.4195
## Apr 2016       57.09874  13.761490 100.43599  -9.179865 123.3773
## May 2016       55.18447   9.454436 100.91450 -14.753580 125.1225
## Jun 2016       53.27019   5.265636 101.27474 -20.146440 126.6868
## Jul 2016       51.35591   1.179006 101.53282 -25.383045 128.0949
## Aug 2016       49.44164  -2.818197 101.70147 -30.482882 129.3662
## Sep 2016       47.52736  -6.736272 101.79099 -35.461703 130.5164
## Oct 2016       45.61308 -10.583684 101.80985 -40.332455 131.5586
df2_holt_model$mean
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2015                                                                        
## 2016 62.84157 60.92730 59.01302 57.09874 55.18447 53.27019 51.35591 49.44164
##           Sep      Oct      Nov      Dec
## 2015                   66.67013 64.75585
## 2016 47.52736 45.61308
df2_holt <- as.data.frame(df2_holt_model)

df2_test$holt <- df2_holt$`Point Forecast`
df2_res_holt <- mape(df2_test$holt, df2_test_ts)*100
## Warning in `-.default`(actual, predicted): longer object length is not a
## multiple of shorter object length

## Warning in `-.default`(actual, predicted): longer object length is not a
## multiple of shorter object length
df2_res_holt
## [1] 53.63046
df2_holt_model_damp <- holt(df2,damped = TRUE ,phi = 0.9,h=15)

df2_holt_model_damp
##          Point Forecast     Lo 80    Hi 80      Lo 95    Hi 95
## Nov 2015       70.39713 41.088650  99.7056  25.573680 115.2206
## Dec 2015       70.33203 36.665924 103.9981  18.844164 121.8199
## Jan 2016       70.27344 32.751379 107.7955  12.888398 127.6585
## Feb 2016       70.22072 29.202735 111.2387   7.489128 132.9523
## Mar 2016       70.17326 25.934049 114.4125   2.515225 137.8313
## Apr 2016       70.13055 22.888638 117.3725  -2.119719 142.3808
## May 2016       70.09211 20.026751 120.1575  -6.476251 146.6605
## Jun 2016       70.05752 17.319211 122.7958 -10.598763 150.7138
## Jul 2016       70.02638 14.743841 125.3089 -14.520970 154.5737
## Aug 2016       69.99836 12.283306 127.7134 -18.269198 158.2659
## Sep 2016       69.97314  9.923749 130.0225 -21.864479 161.8108
## Oct 2016       69.95044  7.653879 132.2470 -25.323930 165.2248
## Nov 2016       69.93002  5.464352 134.3957 -28.661710 168.5217
## Dec 2016       69.91163  3.347327 136.4759 -31.889687 171.7129
## Jan 2017       69.89509  1.296153 138.4940 -35.017928 174.8081
autoplot(df2) +
  autolayer(df2_holt_model, series="Holt's method", PI=FALSE) +
  autolayer(df2_holt_model_damp, series="Damped Holt's method", PI=FALSE) +
  ggtitle("Sales forecasts from Holt's method") + xlab("Year") +
  ylab("Thousands of Sales") +
  guides(colour=guide_legend(title="Forecast"))

Model_df2 <- c('Naive','Simple Exponential Smoothing','Holt Trend Method')

MAPE_df2 <- c(df2_res_naive,df2_res_se,df2_res_holt)

df2_sales <- data.frame(Model_df2, MAPE_df2)

datatable(df2_sales)
summary(lgb_model_sales)
## LightGBM Model (100 trees)
## Objective: regression
## Fitted to dataset with 2 columns